
clear
set more off
cap log close

******************************************************************************************
* Program: figure2_data.do
* Purpose: Prepare the dataset used to generate Figure 2. 
* Mapping and visualization were conducted separately using Foursquare Studio.
******************************************************************************************

************************************************************************
** set macros (directory)
************************************************************************
global prison_dir "~/Dropbox/Prison_Covid/PNAS_Nexus_Replication"
global raw_data "${prison_dir}/raw_data"
global intermediate_data "${prison_dir}/intermediate_data"
global code "${prison_dir}/code"
global analysis_data "${prison_dir}/analysis_data"
global output "${prison_dir}/output"

******************
* Get CA state prison location 
******************

import delimited "${raw_data}/CDCR_covid_19.csv", clear
sort institutionname
drop if institutionname == institutionname[_n-1]

keep institutionname latitude longitude

export delimited "${output}/CA_prison_location.csv", replace 

******************
* Compute sum of contact from all prisons for a ZCTA 
******************

* read ZCTA files 
use "${intermediate_data}/CA_ZCTA_uniq_list.dta", clear

ren ZCTA ZCTA_contact 

* Form every pairwise combination of ZCTA-prison -> expand data to ZCTA-prison level
cross using "${intermediate_data}/cdcr_population_w_ShapefileID.dta" 

keep ZCTA_contact num_zip NAME institution_name COUNTY COUNTYFIPS prison_zip prison_ZCTA population_felons_201912 designed_capacity_201912 staffed_capacity_201912
ren COUNTY prison_cty
ren COUNTYFIPS prison_cty_FIPS

* join to get all prisons covid case in a month (from 03/2020-12/2020, for 35 prisons) 
joinby NAME using "${intermediate_data}/CDCR_covid_19_for_merged_monthly.dta"
drop totalconfirmed_2020 totaldeaths_2020 distinctpatientstested_2020
rename new_cases prison_new_cases 

* merge ZCTA prison contact matrix 
merge 1:1 ZCTA_contact NAME month using "${smartphone_data}/prison_time_from_Jun-OctHome_ZCTA.dta", ///
	keepusing(hr_pri_fr_home prison_zip prison_ZCTA) keep(master match)
// note: data w/ _merge == 2 includes other 5 facilities (hopistal/transitional) as well as contact in Jan, Feb 

* fill in zeros for the contact matrix 
foreach var of varlist hr_pri_fr_home{
	replace `var' = 0 if _merge == 1 & month >= 3 & month <= 10
}
drop _merge 

* sort and keep variables needed 
sort NAME month ZCTA_contact 
keep NAME month ZCTA_contact num_zip hr_pri_fr_home prison_ZCTA
keep if month == 6

* compute total contact
gegen tot_hour_in_pri_June = total(hr_pri_fr_home), by(ZCTA_contact)

* collapse to ZCTA level
sort ZCTA_contact
drop if ZCTA_contact == ZCTA_contact[_n-1]

keep ZCTA_contact tot_hour_in_pri_June

export delimited "${output}/tot_contact_by_ZCTA.csv", replace 
	
******************
* Prepare data to map Treated vs. Control Zip Codes (i.e. zip codes connected to San Quentin or not)
******************

use "${analysis_data}/San_Quentin_main_analysis_data.dta", clear

***************************************
* get weight under different matching schemes * 
***************************************

drdid ZCTA_max_cases_p100k_rmpri $cubic cum_max_cases_p100k_rmpri5 log_tot_pop_acs_15_19 if inlist(month, 5, 7), ///
	ivar(ZCTA_contact) time(month) tr(treated) dripw
drdid_predict wgt_covid_temp 

drdid ZCTA_max_cases_p100k_rmpri $cubic cum_max_cases_p100k_rmpri5 $demo if inlist(month, 5, 7), ///
	ivar(ZCTA_contact) time(month) tr(treated) dripw
drdid_predict wgt_covid_demo_temp

gegen wgt_covid = max(wgt_covid_temp), by(ZCTA_contact)
gegen wgt_covid_demo = max(wgt_covid_demo_temp), by(ZCTA_contact)
drop *temp

***************************************
* collapse to ZCTA_contact level
***************************************

sort ZCTA_contact 
drop if ZCTA_contact == ZCTA_contact[_n-1]

keep ZCTA_contact treated hr_pri_fr_home6 wgt_covid wgt_covid_demo 

export delimited "${output}/SQ_treated_control_ZCTA_wgt.csv", replace 
  